#add pipeline to path
import sys
import datetime
#the path to the ribosome profilign pipeline code
pipeline_dir = '/home/boris/analysis_pipelines/ribo_seq/'
if pipeline_dir not in sys.path:
sys.path.append(pipeline_dir)
import ribo_seq_main
import ribo_settings
import ribo_utils
import ribo_lib
import ribo_qc
import ribo_plotting
import ribo_tables
def current_time():
return datetime.datetime.now().strftime("%Y%m%d_%H%M")
def current_date():
return datetime.datetime.now().strftime("%Y%m%d")
## Now import the pickled read count objects
import os
import os.path as path
#note - my code is memory inefficcient, as our server has 1TB of ram, probably 64 or 128 would be sufficient
#perform_qc() code is pretty poorly written and not multithreaded, so it takes hours to run for large datasets
'''
can also launch from command line if this doesn't wok, which is reccommended to get usefeul error messages
eg: python /path to/ribo_seq/ribo_seq_main.py set1_settings.json.txt --threads 48 --perform-qc
if pipeline runs successfuly it won't repeat all calculations, but will just load the mapped read counts positions.
'''
#as many cores as you're willing to use:
threads = 48
set1_settings = ribo_settings.ribo_settings('settings_files/set1_settings.json.txt')
set1_experiment = ribo_seq_main.experiment(set1_settings, threads)
set1_experiment.make_tables()
set1_experiment.perform_qc()
set2_settings = ribo_settings.ribo_settings('settings_files/set2_settings.json.txt')
set2_experiment = ribo_seq_main.experiment(set2_settings, threads)
set2_experiment.make_tables()
set2_experiment.perform_qc()
set3_settings = ribo_settings.ribo_settings('settings_files/set3_settings.json.txt')
set3_experiment = ribo_seq_main.experiment(set3_settings, threads)
set3_experiment.make_tables()
set3_experiment.perform_qc()
set4_settings = ribo_settings.ribo_settings('settings_files/set4_settings.json.txt')
set4_experiment = ribo_seq_main.experiment(set4_settings, threads)
set4_experiment.make_tables()
set4_experiment.perform_qc()
import pandas as pd
summary_set1 = pd.read_csv(os.path.join(set1_experiment.settings.get_rdir(), 'QC', 'consolidated_read_annotation_summary.tsv'), sep='\t').replace(float('NaN'),0).drop(columns=['Unnamed: 0', 'reads processed'])
summary_set2 = pd.read_csv(os.path.join(set2_experiment.settings.get_rdir(), 'QC', 'consolidated_read_annotation_summary.tsv'), sep='\t').replace(float('NaN'),0).drop(columns=['Unnamed: 0', 'reads processed'])
summary_set3 = pd.read_csv(os.path.join(set3_experiment.settings.get_rdir(), 'QC', 'consolidated_read_annotation_summary.tsv'), sep='\t').replace(float('NaN'),0).drop(columns=['Unnamed: 0', 'reads processed'])
summary_set4 = pd.read_csv(os.path.join(set4_experiment.settings.get_rdir(), 'QC', 'consolidated_read_annotation_summary.tsv'), sep='\t').replace(float('NaN'),0).drop(columns=['Unnamed: 0', 'reads processed'])
#code for condensing dataframes
singular_categories = ['sample', '<10 nt', 'rRNA_mapping']
UTR3 = ['multiple mapping_3p_UTR','unique mapping_3p_UTR']
UTR5 = ['multiple mapping_5p_UTR','unique mapping_5p_UTR']
CDS = ['multiple mapping_CDS', 'unique mapping_CDS',]
intron = [ 'unique mapping_sense_intronic','unique mapping_retained_intron', 'unique mapping_protein_coding', 'unique mapping_nonsense_mediated_decay', 'multiple mapping_nonsense_mediated_decay','multiple mapping_protein_coding', 'multiple mapping_retained_intron','multiple mapping_sense_intronic']
combined_cats = singular_categories + UTR3 + UTR5 + CDS + intron
other = ['unique mapping_sense_overlapping', 'unique mapping_snoRNA', 'unique mapping_transcribed_processed_pseudogene', 'unique mapping_transcribed_unitary_pseudogene', 'unique mapping_transcribed_unprocessed_pseudogene', 'unique mapping_translated_processed_pseudogene', 'unique mapping_unitary_pseudogene', 'unique mapping_unprocessed_pseudogene','unique mapping_processed_transcript', 'unique mapping_ribozyme', 'unique mapping_sRNA', 'unique mapping_scaRNA', 'unique mapping_non_stop_decay', 'unique mapping_not annotated', 'unique mapping_processed_pseudogene', 'unique mapping_bidirectional_promoter_lncRNA', 'multiple mapping_non_stop_decay', 'unique mapping_lincRNA', 'unique mapping_miRNA', 'unique mapping_misc_RNA', 'unique mapping_TEC', 'unique mapping_TR_V_pseudogene', 'unique mapping_antisense_RNA','snRNA_mapping', 'tRNA_mapping', 'multiple mapping_transcribed_unprocessed_pseudogene', 'multiple mapping_translated_processed_pseudogene', 'multiple mapping_unitary_pseudogene', 'multiple mapping_unprocessed_pseudogene', 'multiple mapping_ribozyme', 'multiple mapping_scaRNA', 'multiple mapping_sense_overlapping', 'multiple mapping_snoRNA', 'multiple mapping_transcribed_processed_pseudogene', 'multiple mapping_transcribed_unitary_pseudogene','multiple mapping_not annotated', 'multiple mapping_processed_pseudogene', 'multiple mapping_processed_transcript','multiple mapping_IG_C_pseudogene', 'multiple mapping_TEC', 'multiple mapping_antisense_RNA', 'multiple mapping_bidirectional_promoter_lncRNA', 'multiple mapping_lincRNA', 'multiple mapping_miRNA', 'multiple mapping_misc_RNA',]
def condense_df(summary_df):
##collapses read categories and converts to percent
## discards PhiX reads from calculation
condensed_df = pd.DataFrame()
for cat in singular_categories:
condensed_df[cat] = summary_df[cat]
condensed_df['UTR'] = summary_df[UTR5+UTR3].sum(axis=1)
condensed_df['CDS'] = summary_df[CDS].sum(axis=1)
condensed_df['intron'] = summary_df[intron].sum(axis=1)
condensed_df['other'] = summary_df[list(set(summary_df.columns).difference(combined_cats+['NC_001422.1_phi-X174', 'input reads']))].sum(axis=1)
adjusted_total = summary_df['input reads']-summary_df['NC_001422.1_phi-X174']
for column in condensed_df.columns:
if not column == 'sample':
condensed_df[column] = 100.*condensed_df[column].div(adjusted_total, axis=0)
condensed_df['unmapped'] = 100.-condensed_df.sum(axis=1)
condensed_df = condensed_df.rename(columns={'rRNA_mapping': 'rRNA'})
return condensed_df
condensed_df_set1 = condense_df(summary_set1)
condensed_df_set2 = condense_df(summary_set2)
condensed_df_set3 = condense_df(summary_set3)
condensed_df_set4 = condense_df(summary_set4)
#code for stacked bar plots
%matplotlib inline
import matplotlib.pyplot as plt
import scipy.stats as stats
import numpy as np
plt.rcParams['pdf.fonttype'] = 42 #leaves most text as actual text in PDFs, not outlines
def plot_summary_bar(summary_df, out_name, order=['rRNA', '<10 nt', 'unmapped', 'other', 'intron', 'UTR', 'CDS'], sample_order=None):
if not sample_order==None:
summary_df = summary_df.reindex(sample_order)
ind = np.arange(len(summary_df)) # the x locations for the groups
width = 0.8 # the width of the bars: can also be len(x) sequence
plotLayers = []
bottoms = [0] * len(summary_df)
bottoms = np.array(bottoms)
fig = plt.figure(figsize=(len(summary_df), 4))
num_plots_wide = 1
num_plots_high = 1
plot = fig.add_subplot(num_plots_high, num_plots_wide, 1)
black = (0,0,0)
gray = (127/255.0,127/255.0,127/255.0)
orange = (230/255.0,159/255.0,0)
skyBlue = (86/255.0,180/255.0,233/255.0)
bluishGreen = (0,158/255.0,115/255.0)
yellow = (240/255.0,228/255.0,66/255.0)
blue = (0,114/255.0,178/255.0)
vermillion = (213/255.0,94/255.0,0)
reddishPurple = (204/255.0,121/255.0,167/255.0)
colors = [gray, orange, skyBlue, bluishGreen, blue, reddishPurple, vermillion, yellow]
color_index = 0
for anno_type in order:
plotLayers.append(plot.bar(ind, summary_df[anno_type].values, width, bottom=bottoms, color=colors[color_index], label=anno_type, hatch=None))
color_index += 1
bottoms = bottoms + np.array(summary_df[anno_type].values)
plt.ylabel('percent of reads')
#plt.title('summary of read loss in pipeline')
plot.set_xticks(ind)
plot.set_xticklabels(summary_df['sample'].values, rotation=85)
plt.tight_layout()
# Shrink current axis by 40%
box = plot.get_position()
plot.set_position([box.x0, box.y0, box.width * 0.6, box.height])
# Put a legend to the right of the current axis
plot.legend(loc='center left', bbox_to_anchor=(1, 0.5))
plot.set_ylim(0, 100)
plot.spines['right'].set_visible(False)
plot.spines['top'].set_visible(False)
# plt.subplots_adjust(bottom=0.38, right=0.8)
plt.savefig(out_name, transparent=True)
plot_summary_bar(condensed_df_set1, 'outputs/figure1B.pdf')
plot_summary_bar(condensed_df_set2, 'outputs/figure1D.pdf', sample_order = [1, 3, 0, 2])
plot_summary_bar(condensed_df_set3, 'outputs/figure2F.pdf')
plot_summary_bar(condensed_df_set4, 'outputs/figure4A.pdf')
#collect frag lengths
import pandas as pd
import numpy as np
def get_frag_lengths(experiment):
dfs = []
for lib in experiment.libs:
frag_dict = lib.get_all_CDS_fragment_length_counts()
frag_lengths = sorted(frag_dict.keys())
frag_length_counts = [frag_dict[length] for length in frag_lengths]
d = {'fragment length':frag_lengths, '# reads':frag_length_counts, '% reads':100.*np.array(frag_length_counts)/sum(frag_length_counts), 'sample':[lib.lib_settings.sample_name]*len(frag_length_counts)}
temp_df = pd.DataFrame(data=d)
dfs.append(temp_df)
return pd.concat(dfs)
set1_frag_lengths = get_frag_lengths(set1_experiment)
set2_frag_lengths = get_frag_lengths(set2_experiment)
set3_frag_lengths = get_frag_lengths(set3_experiment)
set4_frag_lengths = get_frag_lengths(set4_experiment)
set1_frag_lengths.to_csv('outputs/set1_frag_lengths.tsv', sep='\t')
set2_frag_lengths.to_csv('outputs/set2_frag_lengths.tsv', sep='\t')
set3_frag_lengths.to_csv('outputs/set3_frag_lengths.tsv', sep='\t')
set4_frag_lengths.to_csv('outputs/set4_frag_lengths.tsv', sep='\t')
set1_frag_lengths=pd.read_csv('outputs/set1_frag_lengths.tsv', sep='\t')
set2_frag_lengths=pd.read_csv('outputs/set2_frag_lengths.tsv', sep='\t')
set3_frag_lengths=pd.read_csv('outputs/set3_frag_lengths.tsv', sep='\t')
set4_frag_lengths=pd.read_csv('outputs/set4_frag_lengths.tsv', sep='\t')
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
plt.rcParams['pdf.fonttype'] = 42 #leaves most text as actual text in PDFs, not outlines
def plot_length_dist(frag_length_df, outname, datasets=None):
if datasets == None:
datasets = frag_length_df['sample'].unique()
end = '5p'
fig = plt.figure(figsize=(5, 5))
plots = []
plot = fig.add_subplot(111)
color_index = 0
group_df=frag_length_df.groupby(['sample'])
for sample in datasets:
df = group_df.get_group(sample)
#df_5p.sort_values(by=['position'], inplace=True)
df.plot(x='fragment length', y='% reads', ax=plot, color = ribo_utils.rainbow[color_index%len(ribo_utils.rainbow)],
linestyle=ribo_utils.line_styles[color_index/len(ribo_utils.rainbow)], lw=2, sharex=True, sharey=True, label = sample)
color_index += 1
#plot.set_title('%s %smer' % (sample, str(fragment_length)), y=0.8, loc='left')
plots.append(plot)
major_xticks=range(12, 60, 3)
#minor_xticks=plot.get_xticks()[::3]
#major_tick_labels=plot.get_xticklabels()[::6]
#plot.set_xticks(minor_xticks, minor=True)
plot.set_xticks(major_xticks, minor=False)
#plot.set_xticklabels(major_tick_labels)
plot.set_ylabel('% CDS-mapping reads', fontsize=20)
plot.set_xlabel('fragment length', fontsize=20)
# Hide the right and top spines
plot.spines['right'].set_visible(False)
plot.spines['top'].set_visible(False)
plot.set_xlim(15, 38)
plot.set_ylim(0,25)
plt.savefig(outname, transparent=True)
plot_length_dist(set1_frag_lengths, 'outputs/figure1C.pdf', datasets = None)
plot_length_dist(set2_frag_lengths, 'outputs/figure1E.pdf', datasets = None)
plot_length_dist(set3_frag_lengths, 'outputs/figure1G.pdf', datasets = ['undepleted_1', 'undepleted_2', 'ribozero_p', 'ribozero_p_o', 'ribozero_p_f', 'ribozero_p_of' ])
plot_length_dist(set4_frag_lengths, 'outputs/figure4B.pdf', datasets = None)
#get CDS-mapping footprint counts from pipeline output
set1_cds_counts = pd.read_csv(os.path.join(set1_experiment.settings.get_rdir(), 'tables', 'cds_counts.tsv'), sep = '\t')
set2_cds_counts = pd.read_csv(os.path.join(set2_experiment.settings.get_rdir(), 'tables', 'cds_counts.tsv'), sep = '\t')
set3_cds_counts = pd.read_csv(os.path.join(set3_experiment.settings.get_rdir(), 'tables', 'cds_counts.tsv'), sep = '\t')
set4_cds_counts = pd.read_csv(os.path.join(set4_experiment.settings.get_rdir(), 'tables', 'cds_counts.tsv'), sep = '\t')
set1_cds_counts.to_csv('outputs/set1_CDS_counts.tsv', sep='\t', index=False)
set2_cds_counts.to_csv('outputs/set2_CDS_counts.tsv', sep='\t', index=False)
set3_cds_counts.to_csv('outputs/set3_CDS_counts.tsv', sep='\t', index=False)
set4_cds_counts.to_csv('outputs/set4_CDS_counts.tsv', sep='\t', index=False)
def get_rpm_df(df):
rpm_df = pd.DataFrame()
rpm_df[['tx_id', 'gene_name']] = df[['tx_id', 'gene_name']]
for column in df.columns:
if column not in ['tx_id', 'gene_name']:
rpm_df[column] = df[column]/(sum(df[column])/10.**6)
return rpm_df
#convert counts to reads per million
set1_cds_rpms = get_rpm_df(set1_cds_counts)
set2_cds_rpms = get_rpm_df(set2_cds_counts)
set3_cds_rpms = get_rpm_df(set3_cds_counts)
set4_cds_rpms = get_rpm_df(set4_cds_counts)
set1_cds_rpms.to_csv('outputs/set1_CDS_rpms.tsv', sep='\t', index=False)
set2_cds_rpms.to_csv('outputs/set2_CDS_rpms.tsv', sep='\t', index=False)
set3_cds_rpms.to_csv('outputs/set3_CDS_rpms.tsv', sep='\t', index=False)
set4_cds_rpms.to_csv('outputs/set4_CDS_rpms.tsv', sep='\t', index=False)
#filter for read counts of at least an average of 64 per library, and 1 read per million in at least 1 dataset
set2_cds_counts = set2_cds_counts[set2_cds_counts.mean(axis=1)>=64]
set3_cds_counts = set3_cds_counts[set3_cds_counts.mean(axis=1)>=64]
set4_cds_counts = set4_cds_counts[set4_cds_counts.mean(axis=1)>=64]
set2_cds_rpms = set2_cds_rpms[set2_cds_rpms['tx_id'].isin(set2_cds_counts['tx_id'])]
set3_cds_rpms = set3_cds_rpms[set3_cds_rpms['tx_id'].isin(set3_cds_counts['tx_id'])]
set4_cds_rpms = set4_cds_rpms[set4_cds_rpms['tx_id'].isin(set4_cds_counts['tx_id'])]
set2_cds_rpms = set2_cds_rpms[set2_cds_rpms.min(axis=1)>=1]
set3_cds_rpms = set3_cds_rpms[set3_cds_rpms.min(axis=1)>=1]
set4_cds_rpms = set4_cds_rpms[set4_cds_rpms.min(axis=1)>=1]
#plot read counts
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
import scipy.stats as stats
import numpy as np
from adjustText import adjust_text
import math
plt.rcParams['pdf.fonttype'] = 42 #leaves most text as actual text in PDFs, not outlines
##plot CDF or mRNA RPKMs
black = (0,0,0)
gray = (0.5,0.5,0.5)
orange = (230/255.0,159/255.0,0)
skyBlue = (86/255.0,180/255.0,233/255.0)
bluishGreen = (0,158/255.0,115/255.0)
yellow = (240/255.0,228/255.0,66/255.0)
blue = (0,114/255.0,178/255.0)
vermillion = (213/255.0,94/255.0,0)
reddishPurple = (204/255.0,121/255.0,167/255.0)
colors = [black, orange, skyBlue, bluishGreen, vermillion, blue, reddishPurple, yellow]
rainbow = [black, vermillion, orange, bluishGreen, blue, reddishPurple, 'violet']
df = set2_cds_rpms
pairs = [('undepleted_20_34', 'undepleted_20_60'), ('undepleted_20_34', 'NEBNext_20_34'), ('undepleted_20_60', 'NEBNext_20_60') , ('NEBNext_20_34','NEBNext_20_60')]
plot_index=1
num_plots_wide=len(pairs)
num_plots_high=1
fig = plt.figure(figsize=(2.5*num_plots_wide, 2.5*num_plots_high))
plot_index = 1
for pair in pairs:
plot = fig.add_subplot(num_plots_high, num_plots_wide, plot_index)
df.plot.scatter(pair[0], pair[1], color=black, s=2, alpha=0.5, ax=plot)
pearson_r, pearson_p = stats.pearsonr(np.log10(df[pair[0]]), np.log10(df[pair[1]]))
plot.text(50, 15000, 'r2=%.4f (%d)' % (pearson_r**2, len(df[pair[0]])))
plot.set_ylim(10, 20000)
plot.set_xlim(10, 20000)
plot.spines['right'].set_visible(False)
plot.spines['top'].set_visible(False)
plot_index+=1
plot.set_xscale('log')
plot.set_yscale('log')
#plot.set_xticks([10**x for x in range(-2, 2)])
#plot.set_yticks([10**x for x in range(-2, 2)])
plot.yaxis.set_minor_locator(plt.NullLocator())
plot.xaxis.set_minor_locator(plt.NullLocator())
plot.set_aspect('equal', 'box')
#lg=plt.legend(loc=4,prop={'size':12}, labelspacing=0.2)
#lg.draw_frame(False)
plt.tight_layout()
plt.savefig('outputs/figure2A.pdf', transparent='True', format='pdf')
#plot read counts
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
import scipy.stats as stats
import numpy as np
from adjustText import adjust_text
import math
plt.rcParams['pdf.fonttype'] = 42 #leaves most text as actual text in PDFs, not outlines
##plot CDF or mRNA RPKMs
black = (0,0,0)
gray = (0.5,0.5,0.5)
orange = (230/255.0,159/255.0,0)
skyBlue = (86/255.0,180/255.0,233/255.0)
bluishGreen = (0,158/255.0,115/255.0)
yellow = (240/255.0,228/255.0,66/255.0)
blue = (0,114/255.0,178/255.0)
vermillion = (213/255.0,94/255.0,0)
reddishPurple = (204/255.0,121/255.0,167/255.0)
colors = [black, orange, skyBlue, bluishGreen, vermillion, blue, reddishPurple, yellow]
rainbow = [black, vermillion, orange, bluishGreen, blue, reddishPurple, 'violet']
df = set3_cds_rpms
pairs = [('undepleted_1', 'undepleted_2'), ('undepleted_1', 'ribozero_p'), ('undepleted_1', 'ribozero_p_o') ,
('undepleted_1', 'ribozero_p_f'), ('undepleted_1', 'ribozero_p_of')]
plot_index=1
num_plots_wide=len(pairs)
num_plots_high=1
fig = plt.figure(figsize=(2.5*num_plots_wide, 2.5*num_plots_high))
plot_index = 1
for pair in pairs:
plot = fig.add_subplot(num_plots_high, num_plots_wide, plot_index)
df.plot.scatter(pair[0], pair[1], color=black, s=2, alpha=0.5, ax=plot)
pearson_r, pearson_p = stats.pearsonr(np.log10(df[pair[0]]), np.log10(df[pair[1]]))
plot.text(5, 15000, 'r2=%.4f (%d)' % (pearson_r**2, len(df[pair[0]])))
plot.set_ylim(1, 50000)
plot.set_xlim(1, 50000)
plot.spines['right'].set_visible(False)
plot.spines['top'].set_visible(False)
plot_index+=1
plot.set_xscale('log')
plot.set_yscale('log')
#plot.set_xticks([10**x for x in range(-2, 2)])
#plot.set_yticks([10**x for x in range(-2, 2)])
plot.yaxis.set_minor_locator(plt.NullLocator())
plot.xaxis.set_minor_locator(plt.NullLocator())
plot.set_aspect('equal', 'box')
#lg=plt.legend(loc=4,prop={'size':12}, labelspacing=0.2)
#lg.draw_frame(False)
plt.tight_layout()
plt.savefig('outputs/figure2B.pdf', transparent='True', format='pdf')
#plot read counts
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
import scipy.stats as stats
import numpy as np
from adjustText import adjust_text
import math
plt.rcParams['pdf.fonttype'] = 42 #leaves most text as actual text in PDFs, not outlines
##plot CDF or mRNA RPKMs
black = (0,0,0)
gray = (0.5,0.5,0.5)
orange = (230/255.0,159/255.0,0)
skyBlue = (86/255.0,180/255.0,233/255.0)
bluishGreen = (0,158/255.0,115/255.0)
yellow = (240/255.0,228/255.0,66/255.0)
blue = (0,114/255.0,178/255.0)
vermillion = (213/255.0,94/255.0,0)
reddishPurple = (204/255.0,121/255.0,167/255.0)
colors = [black, orange, skyBlue, bluishGreen, vermillion, blue, reddishPurple, yellow]
rainbow = [black, vermillion, orange, bluishGreen, blue, reddishPurple, 'violet']
df = set4_cds_rpms
pairs = [('undepleted', 'oligo_depletion')]
plot_index=1
num_plots_wide=len(pairs)
num_plots_high=1
fig = plt.figure(figsize=(2.5*num_plots_wide, 2.5*num_plots_high))
plot_index = 1
for pair in pairs:
plot = fig.add_subplot(num_plots_high, num_plots_wide, plot_index)
df.plot.scatter(pair[0], pair[1], color=black, s=2, alpha=0.5, ax=plot)
pearson_r, pearson_p = stats.pearsonr(np.log10(df[pair[0]]), np.log10(df[pair[1]]))
plot.text(5, 15000, 'r2=%.4f (%d)' % (pearson_r**2, len(df[pair[0]])))
plot.set_ylim(1, 50000)
plot.set_xlim(1, 50000)
plot.spines['right'].set_visible(False)
plot.spines['top'].set_visible(False)
plot_index+=1
plot.set_xscale('log')
plot.set_yscale('log')
#plot.set_xticks([10**x for x in range(-2, 2)])
#plot.set_yticks([10**x for x in range(-2, 2)])
plot.yaxis.set_minor_locator(plt.NullLocator())
plot.xaxis.set_minor_locator(plt.NullLocator())
plot.set_aspect('equal', 'box')
#lg=plt.legend(loc=4,prop={'size':12}, labelspacing=0.2)
#lg.draw_frame(False)
plt.tight_layout()
plt.savefig('outputs/figure4D.pdf', transparent='True', format='pdf')
First, I need to plot start codon metaplots for short and long footprints to get the offsets for the undepleted samples, then pool them (keeping short and long FP seperate), and get reading frames
import numpy as np
import pandas as pd
def start_meta(experiment, up=200, down=200):
"""
up, down - how far up or down of start to align
"""
positions = range(-1*up,down+1)
frag_lengths = [[f] for f in range(15, 38)]
dfs = []
for lib in experiment.libs:
for frag_length in frag_lengths:
for end in '5p', '3p':
normed_count_sum = np.zeros(down + up + 1)
count_sum = np.zeros(down + up + 1)
inclusion_sum = np.zeros(down + up + 1)
assert len(positions) == len(normed_count_sum)
for transcript in lib.transcripts.values():
if True: #could swap lines if there's a group of transcripts in mind
#if transcript.tx_id in tx_to_use:
#being conservatively broad for normalization
cds_reads = transcript.get_cds_read_count(-30, 30, read_end=end, read_lengths=frag_length)
if cds_reads>=10:
tx_count, tx_inclusion = transcript.get_CDS_read_counts_array(transcript.cds_start, -1*up, down, read_end=end, read_lengths=frag_length)
normed_count_sum += tx_count/(float(cds_reads)/transcript.cds_length)
count_sum += tx_count
inclusion_sum += tx_inclusion
normed_by_inclusion = normed_count_sum/inclusion_sum
d = {'fragment length':[frag_length[0]]*len(positions), 'position':positions, 'reads': count_sum, 'normed reads':normed_count_sum, 'included transcripts':inclusion_sum,
'inclusion normalized reads':normed_by_inclusion, 'end':[end]*len(positions), 'sample':[lib.lib_settings.sample_name]*len(positions)}
temp_df = pd.DataFrame(data=d)
dfs.append(temp_df)
return pd.concat(dfs)
set1_start_meta = start_meta(set1_experiment)
set2_start_meta = start_meta(set2_experiment)
set3_start_meta = start_meta(set3_experiment)
set4_start_meta = start_meta(set4_experiment)
set1_start_meta.to_csv('outputs/set1_start_meta.tsv', sep='\t')
set2_start_meta.to_csv('outputs/set2_start_meta.tsv', sep='\t')
set3_start_meta.to_csv('outputs/set3_start_meta.tsv', sep='\t')
set4_start_meta.to_csv('outputs/set4_start_meta.tsv', sep='\t')
set1_start_meta = pd.read_csv('outputs/set1_start_meta.tsv', sep='\t')
set2_start_meta = pd.read_csv('outputs/set2_start_meta.tsv', sep='\t')
set3_start_meta = pd.read_csv('outputs/set3_start_meta.tsv', sep='\t')
set4_start_meta = pd.read_csv('outputs/set4_start_meta.tsv', sep='\t')
def plot_meta_heatmap(input_df, sample_names, outname, end='5p', xlabel=None, positions=range(-30,30), frag_lengths=range(20, 38)):
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
#sns.set(style="white", color_codes=True)
sns.set_style("ticks", {"xtick.major.size": 8, "xtick.minor.size": 4, "ytick.major.size": 8})
fig = plt.figure(figsize=(8, 2*len(sample_names)))
num_plots_high=len(sample_names)
num_plots_wide=1
plot_index=0
input_df = input_df[input_df['fragment length'].isin(frag_lengths)]
group_df = input_df.groupby(['sample', 'end'])
for sample in sample_names:
sample_df = group_df.get_group((sample, end))
sample_df = sample_df[sample_df['position'].isin(positions)]
plot = fig.add_subplot(num_plots_high, num_plots_wide, plot_index + 1)
metaplot = sample_df.pivot("fragment length", "position", "reads")
sns.heatmap(metaplot, square=True, robust=False, ax=plot, cmap='gray_r')#The robust parameter sets the scale by the data quartiles, not the max and min
major_xticks=plot.get_xticks()[::3]
minor_xticks=plot.get_xticks()
major_tick_labels=plot.get_xticklabels()[::3]
plot.set_xticklabels(major_tick_labels)
plot.set_xticks(minor_xticks, minor=True)
plot.set_xticks(major_xticks, minor=False)
plot.set_title(sample, fontsize=18)
if not xlabel==None:
plot.set_xlabel(xlabel, fontsize=8)
plot.set_ylabel('fragment length', fontsize=8)
plot_index+=1
plt.tight_layout()
plt.savefig(outname, transparent=True)
outname='outputs/figureS1A.pdf'
plot_meta_heatmap(set1_start_meta, ['1A_noDep', '2A_noDep', '1D_NEB', '2D_NEB'], outname, end='5p', positions=range(-30,30), xlabel=u'fragment 5΄ end relative to first nt of CDS')
outname='outputs/figureS1B.pdf'
plot_meta_heatmap(set2_start_meta, ['undepleted_20_34', 'undepleted_20_60', 'NEBNext_20_34', 'NEBNext_20_60',], outname, end='5p', positions=range(-30,30), xlabel=u'fragment 5΄ end relative to first nt of CDS')
outname='outputs/figureS2.pdf'
plot_meta_heatmap(set3_start_meta, ["undepleted_1","undepleted_2","ribozero_p","ribozero_p_o","ribozero_p_f","ribozero_p_of"], outname, end='5p', positions=range(-30,30), xlabel=u'fragment 5΄ end relative to first nt of CDS')
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
plt.rcParams['pdf.fonttype'] = 42 #leaves most text as actual text in PDFs, not outlines
def plot_start_meta(df, outname, end = '5p', frag_lengths = range(18,34), ylim=(0,10), xlim=(-50, 100)):
if frag_lengths == 'all':
frag_lengths = df['fragment length'].unique()
fig = plt.figure(figsize=(len(df['sample'].unique())*4, len(frag_lengths)*2))
plot_index = 1
plots = []
group_df = df.groupby(['sample', 'fragment length', 'end'])
for fragment_length in frag_lengths:
for sample in df['sample'].unique():
plot = fig.add_subplot(len(frag_lengths), len(df['sample'].unique()), plot_index)
sub_df = group_df.get_group((sample, fragment_length, end))
#df_5p.sort_values(by=['position'], inplace=True)
sub_df.plot(y='inclusion normalized reads', x='position', ax=plot, color = ribo_utils.rainbow[0], lw=2)
plot.set_title('%s %smer' % (sample, str(fragment_length)), y=0.8, loc='left')
plot.vlines([-12,-13, -14], 0, 10, linestyle='dashed')
plots.append(plot)
plot_index += 1
for plot in plots:
plot.set_ylim(ylim[0],ylim[1])
#plot.set_xticks(range(upstream, downstream))
#major_xticks=plot.get_xticks()[::9]
#minor_xticks=plot.get_xticks()[::3]
#major_tick_labels=plot.get_xticklabels()[::6]
#print plot.get_xticklabels()[::6]
#plot.set_xticks(minor_xticks, minor=True)
#plot.set_xticks(major_xticks, minor=False)
#plot.set_xticklabels(major_tick_labels)
plot.set_ylabel('mean normalized reads')
plot.set_xlabel('nt from CDS start')
# Hide the right and top spines
plot.spines['right'].set_visible(False)
plot.spines['top'].set_visible(False)
plot.set_xlim(xlim[0], xlim[1])
plot.legend_.remove()
plt.tight_layout()
plt.savefig(outname, transparent=True)
plot_start_meta(set1_start_meta, 'outputs/set1_by_length_start.pdf', end = '5p', frag_lengths = range(20,37), ylim=(0,10), xlim=(-20, 0))
plot_start_meta(set2_start_meta, 'outputs/set2_by_length_start.pdf', end = '5p', frag_lengths = range(20,33), ylim=(0,10), xlim=(-20, 0))
plot_start_meta(set3_start_meta, 'outputs/set3_by_length_start.pdf', end = '5p', frag_lengths = range(20,33), ylim=(0,10), xlim=(-20, 0))
plot_start_meta(set4_start_meta, 'outputs/set4_by_length_start.pdf', end = '5p', frag_lengths = range(18,38), ylim=(0,15), xlim=(-20, 0))
#first need to define the offsets based on the start codon metaplots:
set1_long_offsets = {30:-12, 31:-12, 32:-13, 33:-14, 34:-14, 35:-14}
set2_short_offsets = {20:-12, 21:-12, 22:-13, 23:-13}
set2_long_offsets = {25:-12, 26:-12, 27:-13, 28:-13, 29:-12, 30:-13, 31:-13, 32:-14}
set3_short_offsets = {20:-12, 21:-12, 22:-13, 23:-13}
set3_long_offsets = {25:-12, 26:-12, 27:-13, 28:-13, 29:-12, 30:-13, 31:-13, 32:-14}
set4_short_offsets = {20:-12, 21:-12, 22:-13, 23:-13}
set4_long_offsets = {25:-12, 26:-12, 27:-12, 28:-12, 29:-12, 30:-13, 31:-13, 32:-13}
import numpy as np
import pandas as pd
def start_meta_pooled_offsets(experiment, offsets_5p, up=100, down=200):
"""
up, down - how far up or down of start to align
offsets_5p - a list of offset dictionaries of type {20:-12, 21:-12, 22:-13, 23:-13}, which are the distances of the read 5' end to the first nt of the P site of the
"""
positions = range(-1*up,down+1)
dfs = []
for offset_dict in offsets_5p:
for lib in experiment.libs:
for end in '5p', '3p':
normed_count_sum = np.zeros(down + up + 1)
count_sum = np.zeros(down + up + 1)
inclusion_sum = np.zeros(down + up + 1)
assert len(positions) == len(normed_count_sum)
frag_lengths = offset_dict.keys()
for transcript in lib.transcripts.values():
#being conservatively broad for normalization
cds_reads = transcript.get_cds_read_count(-30, 30, read_end=end, read_lengths=frag_lengths)
if cds_reads>=16:
tx_count, tx_inclusion = np.zeros(down + up + 1), np.zeros(down + up + 1)
for frag_length in frag_lengths:
if end == '5p':
partial_tx_count, partial_tx_inclusion = transcript.get_CDS_read_counts_array(transcript.cds_start+offset_dict[frag_length], (-1*up), down, read_end=end, read_lengths=[frag_length])
if end == '3p':
partial_tx_count, partial_tx_inclusion = transcript.get_CDS_read_counts_array(transcript.cds_start+((frag_length+offset_dict[frag_length])-1), (-1*up), down, read_end=end, read_lengths=[frag_length])
tx_count += partial_tx_count
tx_inclusion += partial_tx_inclusion
tx_inclusion = tx_inclusion/len(frag_lengths)
normed_count_sum += tx_count/(float(cds_reads)/transcript.cds_length)
count_sum += tx_count
inclusion_sum += tx_inclusion
normed_by_inclusion = normed_count_sum/inclusion_sum
d = {'fragment lengths':[','.join(['%d' % x for x in sorted(frag_lengths)])]*len(positions), 'position':positions, 'reads': count_sum, 'normed reads':normed_count_sum, 'included transcripts':inclusion_sum,
'inclusion normalized reads':normed_by_inclusion, 'end':[end]*len(positions), 'sample':[lib.lib_settings.sample_name]*len(positions)}
temp_df = pd.DataFrame(data=d)
dfs.append(temp_df)
return pd.concat(dfs)
set1_start_meta_pooled = start_meta_pooled_offsets(set1_experiment, [set1_long_offsets])
set2_start_meta_pooled = start_meta_pooled_offsets(set2_experiment, [set2_short_offsets, set2_long_offsets])
set3_start_meta_pooled = start_meta_pooled_offsets(set3_experiment, [set3_short_offsets, set3_long_offsets])
set4_start_meta_pooled = start_meta_pooled_offsets(set4_experiment, [set4_short_offsets, set4_long_offsets])
set1_start_meta_pooled.to_csv('outputs/set1_start_meta_pooled.tsv', sep='\t')
set2_start_meta_pooled.to_csv('outputs/set2_start_meta_pooled.tsv', sep='\t')
set3_start_meta_pooled.to_csv('outputs/set3_start_meta_pooled.tsv', sep='\t')
set4_start_meta_pooled.to_csv('outputs/set4_start_meta_pooled.tsv', sep='\t')
def plot_start_meta_pooled(df, outname, end = '5p', frag_lengths = range(18,34), ylim=(0,10), xlim=(-50, 100)):
if frag_lengths == 'all':
frag_lengths = df['fragment lengths'].unique()
fig = plt.figure(figsize=(len(df['sample'].unique())*4, len(frag_lengths)*2))
plot_index = 1
plots = []
group_df = df.groupby(['sample', 'fragment lengths', 'end'])
for fragment_length in frag_lengths:
for sample in df['sample'].unique():
plot = fig.add_subplot(len(frag_lengths), len(df['sample'].unique()), plot_index)
sub_df = group_df.get_group((sample, fragment_length, end))
#df_5p.sort_values(by=['position'], inplace=True)
sub_df.plot(y='inclusion normalized reads', x='position', ax=plot, color = ribo_utils.rainbow[0], lw=1.5)
plot.set_title('%s %smer %d' % (sample, str(fragment_length), sum(sub_df['reads'])), y=0.8, loc='left')
#plot.vlines([-12,-13, -14], 0, 10, linestyle='dashed')
plots.append(plot)
plot_index += 1
for plot in plots:
plot.set_ylim(ylim[0],ylim[1])
#plot.set_xticks(range(upstream, downstream))
#major_xticks=plot.get_xticks()[::9]
#minor_xticks=plot.get_xticks()[::3]
#major_tick_labels=plot.get_xticklabels()[::6]
#print plot.get_xticklabels()[::6]
#plot.set_xticks(minor_xticks, minor=True)
#plot.set_xticks(major_xticks, minor=False)
#plot.set_xticklabels(major_tick_labels)
plot.set_ylabel('mean normalized reads')
plot.set_xlabel('nt from CDS start')
# Hide the right and top spines
plot.spines['right'].set_visible(False)
plot.spines['top'].set_visible(False)
plot.set_xlim(xlim[0], xlim[1])
plot.legend_.remove()
plt.tight_layout()
plt.savefig(outname, transparent=True)
#note: I overlaid the bottom row of plots in illustrator to make figure 3A
plot_start_meta_pooled(set2_start_meta_pooled, 'outputs/figure3A.pdf', end = '5p', frag_lengths ='all', ylim=(0,8), xlim=(-20, 50))
plot_start_meta_pooled(set3_start_meta_pooled, 'outputs/figure3B.pdf', end = '5p', frag_lengths = 'all', ylim=(0,8), xlim=(-20, 50))
#note: I overlaid the rows in illustrator to make figure 4C
plot_start_meta_pooled(set3_start_meta_pooled, 'outputs/figure4C.pdf', end = '5p', frag_lengths = 'all', ylim=(0,12), xlim=(-20, 50))